package edu.stanford.nlp.semparse.open.model.tree; import java.util.List; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import edu.stanford.nlp.semparse.open.dataset.ExampleCached; import edu.stanford.nlp.semparse.open.dataset.Example; import edu.stanford.nlp.semparse.open.ling.LingUtils; import edu.stanford.nlp.semparse.open.util.SearchResult; import edu.stanford.nlp.semparse.open.util.WebUtils; import fig.basic.LogInfo; import fig.basic.Option; /** * Compute the knowledge tree for the given example by doing web searches or following links. */ public class KnowledgeTreeBuilder { public static class Options { @Option(gloss = "Maximum number of search results to keep per query") public int maxResultsPerSearch = 1; @Option public int maxFullTextLength = 140; @Option public boolean ignoreTextNodes = false; @Option public boolean useWikipedia = false; @Option public boolean useFakeGoogle = false; @Option(gloss = "level of entity string normalization when creating the knowledge tree " + "(0 = none / 1 = whitespace / 2 = simple / 3 = aggressive)") public int earlyNormalizeEntities = 1; @Option public boolean alsoNormalizeBR = true; @Option public boolean onlyNormalizeBR = false; } public static Options opts = new Options(); public void buildKnowledgeTree(Example ex) { LogInfo.begin_track("KnowledgeTreeBuilder %s", ex); if (ex instanceof ExampleCached) { ExampleCached cex = (ExampleCached) ex; if (cex.hashcode != null && cex.cacheDirectory != null) { // Use the cached web page LogInfo.begin_track("[CACHED %s]", cex.hashcode); ex.tree = new KNode(null, KNode.Type.QUERY, cex.phrase); Document doc = WebUtils.getWebpageFromHashcode(cex.cacheDirectory, cex.hashcode); buildKnowledgeTreeFromDocument(doc, ex.tree.createChild(KNode.Type.URL, cex.url)); LogInfo.end_track(); } else { // Download from the Internet LogInfo.begin_track("[URL %s]", cex.url); ex.tree = new KNode(null, KNode.Type.QUERY, cex.phrase); Document doc = WebUtils.getWebpage(cex.url); buildKnowledgeTreeFromDocument(doc, ex.tree.createChild(KNode.Type.URL, cex.url)); LogInfo.end_track(); } } else if (opts.useWikipedia) { // Cheat on Wikipedia String query = ex.phrase; String url = "http://en.wikipedia.org/wiki/" + query.replaceAll(" ", "_"); LogInfo.begin_track("[WIKIPEDIA %s]", url); ex.tree = new KNode(null, KNode.Type.QUERY, query); Document doc = WebUtils.getWebpage(url); buildKnowledgeTreeFromDocument(doc, ex.tree.createChild(KNode.Type.URL, url)); LogInfo.end_track(); } else { // Perform a Google search String query = "list of " + ex.phrase; List<SearchResult> results; if (opts.useFakeGoogle) { results = WebUtils.fakeGoogleSearch(query); } else { results = WebUtils.googleSearch(query); } if (results.size() > opts.maxResultsPerSearch) results = results.subList(0, opts.maxResultsPerSearch); LogInfo.logs("%s search results", results.size()); // Fetch the web pages of all the top pages. ex.tree = new KNode(null, KNode.Type.QUERY, query); for (SearchResult result : results) { LogInfo.begin_track("%s", result); Document doc = WebUtils.getWebpage(result.url); buildKnowledgeTreeFromDocument(doc, ex.tree.createChild(KNode.Type.URL, result.url)); LogInfo.end_track(); } } LogInfo.end_track(); } /** * Build a knowledge tree from jsoup Document object and attach the result to |root|. * @param doc The jsoup Document. The first child of |doc| should be an <html> tag. * @param root The parent of the created tree's root node. */ public void buildKnowledgeTreeFromDocument(Document doc, KNode root) { if (!doc.child(0).tagName().equals("html")) { LogInfo.fail(doc.child(0).tagName()); } HTMLFixer fixer = new HTMLFixer(doc); fixer.fixAllTables(); if (!opts.onlyNormalizeBR) { convertElementToKTree(doc.child(0), root); } if (opts.alsoNormalizeBR) { fixer.fixAllBRs(); convertElementToKTree(doc.child(0), root); } for (KNode htmlNode : root.getChildren()) { htmlNode.generateTimestamp(); } } /** * Convert jsoup Element (= an HTML tag and its content) into a knowledge tree. * Contents inside style tag (CSS) and script tag (JavaScript) are ignored. * * @param elt The jsoup Element corresponding to the root of the tree * @param parent The parent of the created tree's root node. */ public void convertElementToKTree(Element elt, KNode parent) { String eltText = LingUtils.normalize(elt.text(), opts.earlyNormalizeEntities); KNode currentNode = parent.createChild(KNode.Type.TAG, elt.tagName(), eltText.length() > opts.maxFullTextLength ? null : eltText); // Add children for (Node child : elt.childNodes()) { if (child instanceof Element) { convertElementToKTree((Element) child, currentNode); } else if (child instanceof TextNode) { if (!opts.ignoreTextNodes) { String text = LingUtils.normalize(((TextNode) child).text(), opts.earlyNormalizeEntities); if (!text.isEmpty()) { //currentNode.createChild(KNode.Type.TEXT, text, text); currentNode.createChild(KNode.Type.TAG, "text", text.length() > opts.maxFullTextLength ? null : text); } } } } // Add attributes for (Attribute attr : elt.attributes()) { currentNode.createAttribute(attr.getKey(), attr.getValue()); } } // ============================================================ // Test Suite // ============================================================ public static void main(String[] args) { KnowledgeTreeBuilder builder = new KnowledgeTreeBuilder(); /*{ ExampleCached ex = new ExampleCached("snooker tournaments", "frozen.cache/wiki/", "e88739bb552c5abef23b24fbf6e2e911cd3d2bac", null, null); builder.buildKnowledgeTree(ex); KNodeUtils.printTree(ex.tree.getChildren().get(0).getChildren().get(0)); }*/ { ExampleCached ex = new ExampleCached("Slovenian film", "frozen.cache/wiki/", "62f4c2c17afec7e54d6362f280a1e6ab65444e73", null, null); builder.buildKnowledgeTree(ex); KNodeUtils.printTree(ex.tree); } /*{ ExampleCached ex = new ExampleCached("oxymorons", "frozen.cache/02/", "1bdfe640b5f5680f328e90a77136fd883a3b0105", null, null); builder.buildKnowledgeTree(ex); KNodeUtils.printTree(ex.tree.getChildren().get(0).getChildren().get(0)); }*/ } }